library(dummies)
dummies-1.5.6 provided by Decision Patterns
Warning message:
package ‘gplots’ was built under R version 3.5.2
library(ltm)
Loading required package: MASS
Loading required package: msm
Loading required package: polycor
library(dplyr)
package ‘dplyr’ was built under R version 3.5.2
Attaching package: ‘dplyr’
The following object is masked from ‘package:MASS’:
select
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(readxl)
package ‘readxl’ was built under R version 3.5.2
library(psych)
package ‘psych’ was built under R version 3.5.2
Attaching package: ‘psych’
The following object is masked from ‘package:ltm’:
factor.scores
The following object is masked from ‘package:polycor’:
polyserial
require(MASS)
library(FNN)
package ‘FNN’ was built under R version 3.5.2
library(adabag)
Loading required package: rpart
Loading required package: caret
Loading required package: lattice
Loading required package: ggplot2
Attaching package: ‘ggplot2’
The following objects are masked from ‘package:psych’:
%+%, alpha
Loading required package: foreach
Loading required package: doParallel
Loading required package: iterators
Loading required package: parallel
Attaching package: ‘adabag’
The following object is masked from ‘package:ltm’:
margins
library(rpart)
library(caret)
library(randomForest)
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
Attaching package: ‘randomForest’
The following object is masked from ‘package:ggplot2’:
margin
The following object is masked from ‘package:psych’:
outlier
The following object is masked from ‘package:dplyr’:
combine
library(party)
package ‘party’ was built under R version 3.5.2Loading required package: grid
Loading required package: mvtnorm
package ‘mvtnorm’ was built under R version 3.5.2Loading required package: modeltools
Loading required package: stats4
Loading required package: strucchange
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
Loading required package: sandwich
library(ROCR)
library(ggplot2)
library(rpart.plot)
nrow(bank)
[1] 45211
sum(is.na(bank))
[1] 0
success <- bank[bank$y == "yes",]
success
nrow(success)
[1] 5289
failure <- bank[bank$y == "no",]
failure
nrow(failure)
[1] 39922
summary(bank_data)
age job marital education default
Min. :18.00 management :1170 divorced: 594 primary : 688 no :5222
1st Qu.:32.00 blue-collar: 957 married :3023 secondary:2618 yes: 66
Median :39.00 technician : 870 single :1671 tertiary :1728
Mean :41.49 admin. : 613 unknown : 254
3rd Qu.:49.00 services : 440
Max. :95.00 retired : 390
(Other) : 848
balance housing loan contact day month
Min. :-3058 no :2792 no :4581 cellular :3801 Min. : 1.00 may :1346
1st Qu.: 133 yes:2496 yes: 707 telephone: 393 1st Qu.: 8.00 jul : 749
Median : 577 unknown :1094 Median :15.00 aug : 695
Mean : 1571 Mean :15.47 jun : 577
3rd Qu.: 1762 3rd Qu.:21.00 apr : 441
Max. :81204 Max. :31.00 nov : 405
(Other):1075
duration campaign pdays previous poutcome y
Min. : 2 Min. : 1.000 Min. : -1.00 Min. : 0.0000 failure: 580 no :2644
1st Qu.: 139 1st Qu.: 1.000 1st Qu.: -1.00 1st Qu.: 0.0000 other : 269 yes:2644
Median : 256 Median : 2.000 Median : -1.00 Median : 0.0000 success: 545
Mean : 374 Mean : 2.547 Mean : 52.76 Mean : 0.8907 unknown:3894
3rd Qu.: 494 3rd Qu.: 3.000 3rd Qu.: 78.00 3rd Qu.: 1.0000
Max. :3881 Max. :41.000 Max. :871.00 Max. :58.0000
cbind(bank_data,dummy(bank_data$job, sep = "_"))
job_dummy <- dummy(bank_data$job, sep = "_")
job_dummy
job_admin. job_blue-collar job_entrepreneur job_housemaid job_management job_retired
[1,] 0 0 0 0 0 0
[2,] 1 0 0 0 0 0
[3,] 0 0 0 0 1 0
[4,] 0 0 0 0 0 0
[5,] 0 0 0 0 0 0
[6,] 0 0 0 0 0 0
[7,] 0 0 0 0 0 0
[8,] 0 0 0 0 0 1
[9,] 1 0 0 0 0 0
[10,] 1 0 0 0 0 0
[11,] 1 0 0 0 0 0
[12,] 0 0 0 0 1 0
[13,] 0 0 0 0 1 0
[14,] 0 0 0 0 0 1
[15,] 0 0 0 0 1 0
[16,] 0 0 0 0 1 0
[17,] 0 0 0 0 0 0
[18,] 0 0 0 0 1 0
[19,] 0 0 0 0 0 0
[20,] 0 0 0 0 0 0
[21,] 0 0 0 0 0 0
[22,] 0 0 0 0 1 0
[23,] 0 0 0 0 0 0
[24,] 1 0 0 0 0 0
[25,] 1 0 0 0 0 0
[26,] 1 0 0 0 0 0
[27,] 0 1 0 0 0 0
[28,] 0 0 0 0 1 0
[29,] 0 0 0 0 0 0
[30,] 0 0 0 0 0 1
[31,] 0 0 0 0 0 0
[32,] 0 0 0 0 0 0
[33,] 0 1 0 0 0 0
[34,] 0 0 0 0 1 0
[35,] 0 0 0 0 0 1
[36,] 0 0 0 0 1 0
[37,] 0 0 0 0 1 0
[38,] 0 0 1 0 0 0
[39,] 0 1 0 0 0 0
[40,] 0 0 0 0 0 0
[41,] 1 0 0 0 0 0
[42,] 0 0 0 0 0 1
[43,] 0 0 0 0 0 0
[44,] 0 0 0 0 0 0
[45,] 0 0 0 0 0 0
[46,] 0 1 0 0 0 0
[47,] 1 0 0 0 0 0
[48,] 0 0 0 1 0 0
[49,] 1 0 0 0 0 0
[50,] 0 0 0 0 0 1
[51,] 0 0 0 0 0 0
[52,] 0 0 0 0 1 0
[53,] 0 1 0 0 0 0
[54,] 0 0 0 0 1 0
[55,] 0 1 0 0 0 0
[56,] 0 0 0 0 0 0
[57,] 0 1 0 0 0 0
[58,] 1 0 0 0 0 0
[59,] 0 0 0 0 1 0
[60,] 0 0 0 0 1 0
[61,] 0 0 0 0 0 1
[62,] 0 0 0 0 1 0
[63,] 0 0 0 0 0 0
[64,] 0 0 0 0 0 0
[65,] 0 0 0 0 1 0
[66,] 0 0 0 0 1 0
[67,] 0 1 0 0 0 0
[68,] 0 0 0 0 0 1
[69,] 0 1 0 0 0 0
[70,] 0 0 0 0 1 0
[71,] 1 0 0 0 0 0
[72,] 0 0 0 0 0 0
[73,] 1 0 0 0 0 0
[74,] 0 0 0 0 1 0
[75,] 0 0 0 0 1 0
[76,] 0 0 0 0 0 0
[77,] 0 0 0 0 0 0
[78,] 0 0 0 0 0 1
[79,] 0 0 0 0 0 0
[80,] 0 0 0 0 0 0
[81,] 0 0 0 0 1 0
[82,] 0 0 0 0 1 0
[83,] 0 0 0 0 0 1
job_self-employed job_services job_student job_technician job_unemployed job_unknown
[1,] 0 0 0 0 1 0
[2,] 0 0 0 0 0 0
[3,] 0 0 0 0 0 0
[4,] 0 0 1 0 0 0
[5,] 0 0 0 1 0 0
[6,] 0 0 0 0 0 1
[7,] 0 0 0 0 1 0
[8,] 0 0 0 0 0 0
[9,] 0 0 0 0 0 0
[10,] 0 0 0 0 0 0
[11,] 0 0 0 0 0 0
[12,] 0 0 0 0 0 0
[13,] 0 0 0 0 0 0
[14,] 0 0 0 0 0 0
[15,] 0 0 0 0 0 0
[16,] 0 0 0 0 0 0
[17,] 0 0 1 0 0 0
[18,] 0 0 0 0 0 0
[19,] 0 0 0 1 0 0
[20,] 0 0 1 0 0 0
[21,] 0 0 0 1 0 0
[22,] 0 0 0 0 0 0
[23,] 0 0 0 1 0 0
[24,] 0 0 0 0 0 0
[25,] 0 0 0 0 0 0
[26,] 0 0 0 0 0 0
[27,] 0 0 0 0 0 0
[28,] 0 0 0 0 0 0
[29,] 0 0 0 1 0 0
[30,] 0 0 0 0 0 0
[31,] 0 1 0 0 0 0
[32,] 0 0 0 0 0 1
[33,] 0 0 0 0 0 0
[34,] 0 0 0 0 0 0
[35,] 0 0 0 0 0 0
[36,] 0 0 0 0 0 0
[37,] 0 0 0 0 0 0
[38,] 0 0 0 0 0 0
[39,] 0 0 0 0 0 0
[40,] 0 0 0 1 0 0
[41,] 0 0 0 0 0 0
[42,] 0 0 0 0 0 0
[43,] 0 0 0 0 1 0
[44,] 0 0 0 1 0 0
[45,] 0 0 1 0 0 0
[46,] 0 0 0 0 0 0
[47,] 0 0 0 0 0 0
[48,] 0 0 0 0 0 0
[49,] 0 0 0 0 0 0
[50,] 0 0 0 0 0 0
[51,] 0 0 0 1 0 0
[52,] 0 0 0 0 0 0
[53,] 0 0 0 0 0 0
[54,] 0 0 0 0 0 0
[55,] 0 0 0 0 0 0
[56,] 0 1 0 0 0 0
[57,] 0 0 0 0 0 0
[58,] 0 0 0 0 0 0
[59,] 0 0 0 0 0 0
[60,] 0 0 0 0 0 0
[61,] 0 0 0 0 0 0
[62,] 0 0 0 0 0 0
[63,] 0 1 0 0 0 0
[64,] 0 0 0 0 1 0
[65,] 0 0 0 0 0 0
[66,] 0 0 0 0 0 0
[67,] 0 0 0 0 0 0
[68,] 0 0 0 0 0 0
[69,] 0 0 0 0 0 0
[70,] 0 0 0 0 0 0
[71,] 0 0 0 0 0 0
[72,] 0 0 0 1 0 0
[73,] 0 0 0 0 0 0
[74,] 0 0 0 0 0 0
[75,] 0 0 0 0 0 0
[76,] 1 0 0 0 0 0
[77,] 0 1 0 0 0 0
[78,] 0 0 0 0 0 0
[79,] 0 0 0 1 0 0
[80,] 0 0 0 1 0 0
[81,] 0 0 0 0 0 0
[82,] 0 0 0 0 0 0
[83,] 0 0 0 0 0 0
[ reached getOption("max.print") -- omitted 5205 rows ]
bank_data$job <- NULL
bank_data
cbind(bank_data,dummy(bank_data$marital, sep = "_"))
marital_dummy <- dummy(bank_data$marital, sep = "_")
marital_dummy
marital_divorced marital_married marital_single
[1,] 0 0 1
[2,] 0 0 1
[3,] 0 1 0
[4,] 0 0 1
[5,] 0 1 0
[6,] 0 1 0
[7,] 0 1 0
[8,] 0 1 0
[9,] 0 1 0
[10,] 0 0 1
[11,] 0 1 0
[12,] 0 1 0
[13,] 0 1 0
[14,] 0 1 0
[15,] 0 1 0
[16,] 0 1 0
[17,] 0 0 1
[18,] 0 0 1
[19,] 0 0 1
[20,] 0 0 1
[21,] 0 0 1
[22,] 1 0 0
[23,] 0 1 0
[24,] 1 0 0
[25,] 0 0 1
[26,] 1 0 0
[27,] 0 1 0
[28,] 0 1 0
[29,] 0 0 1
[30,] 1 0 0
[31,] 1 0 0
[32,] 0 1 0
[33,] 0 1 0
[34,] 0 1 0
[35,] 1 0 0
[36,] 0 0 1
[37,] 0 0 1
[38,] 0 1 0
[39,] 0 0 1
[40,] 0 1 0
[41,] 0 1 0
[42,] 0 1 0
[43,] 0 1 0
[44,] 0 1 0
[45,] 0 0 1
[46,] 0 1 0
[47,] 0 1 0
[48,] 0 0 1
[49,] 0 1 0
[50,] 0 1 0
[51,] 0 1 0
[52,] 0 1 0
[53,] 0 1 0
[54,] 0 1 0
[55,] 0 1 0
[56,] 1 0 0
[57,] 0 0 1
[58,] 0 1 0
[59,] 0 1 0
[60,] 0 1 0
[61,] 1 0 0
[62,] 0 0 1
[63,] 0 0 1
[64,] 0 0 1
[65,] 0 1 0
[66,] 1 0 0
[67,] 0 1 0
[68,] 0 1 0
[69,] 0 1 0
[70,] 0 1 0
[71,] 0 1 0
[72,] 0 0 1
[73,] 0 1 0
[74,] 0 1 0
[75,] 0 1 0
[76,] 0 0 1
[77,] 0 0 1
[78,] 0 1 0
[79,] 0 1 0
[80,] 1 0 0
[81,] 0 1 0
[82,] 0 0 1
[83,] 0 1 0
[84,] 0 0 1
[85,] 0 0 1
[86,] 0 1 0
[87,] 1 0 0
[88,] 1 0 0
[89,] 0 1 0
[90,] 0 1 0
[91,] 0 0 1
[92,] 0 0 1
[93,] 0 1 0
[94,] 0 1 0
[95,] 0 0 1
[96,] 0 1 0
[97,] 0 0 1
[98,] 0 1 0
[99,] 1 0 0
[100,] 0 0 1
[101,] 0 0 1
[102,] 0 0 1
[103,] 1 0 0
[104,] 0 0 1
[105,] 0 1 0
[106,] 0 1 0
[107,] 0 1 0
[108,] 0 0 1
[109,] 0 1 0
[110,] 0 1 0
[111,] 0 0 1
[112,] 0 1 0
[113,] 0 0 1
[114,] 0 1 0
[115,] 0 1 0
[116,] 0 0 1
[117,] 0 0 1
[118,] 0 1 0
[119,] 0 0 1
[120,] 0 1 0
[121,] 1 0 0
[122,] 0 1 0
[123,] 0 1 0
[124,] 1 0 0
[125,] 1 0 0
[126,] 0 1 0
[127,] 0 1 0
[128,] 0 1 0
[129,] 0 1 0
[130,] 0 1 0
[131,] 1 0 0
[132,] 1 0 0
[133,] 1 0 0
[134,] 0 0 1
[135,] 0 0 1
[136,] 0 0 1
[137,] 0 1 0
[138,] 1 0 0
[139,] 0 1 0
[140,] 0 1 0
[141,] 0 1 0
[142,] 0 1 0
[143,] 0 1 0
[144,] 0 1 0
[145,] 1 0 0
[146,] 0 1 0
[147,] 0 1 0
[148,] 0 1 0
[149,] 0 0 1
[150,] 0 0 1
[151,] 0 1 0
[152,] 0 1 0
[153,] 0 1 0
[154,] 0 0 1
[155,] 1 0 0
[156,] 0 0 1
[157,] 1 0 0
[158,] 0 1 0
[159,] 0 0 1
[160,] 0 1 0
[161,] 0 1 0
[162,] 0 1 0
[163,] 0 1 0
[164,] 0 1 0
[165,] 0 1 0
[166,] 1 0 0
[167,] 0 0 1
[168,] 0 0 1
[169,] 0 1 0
[170,] 0 1 0
[171,] 1 0 0
[172,] 1 0 0
[173,] 1 0 0
[174,] 0 1 0
[175,] 0 1 0
[176,] 0 0 1
[177,] 1 0 0
[178,] 0 0 1
[179,] 0 0 1
[180,] 0 0 1
[181,] 0 1 0
[182,] 0 0 1
[183,] 0 1 0
[184,] 0 1 0
[185,] 0 1 0
[186,] 0 1 0
[187,] 0 1 0
[188,] 0 1 0
[189,] 0 0 1
[190,] 0 0 1
[191,] 0 0 1
[192,] 0 0 1
[193,] 0 1 0
[194,] 0 0 1
[195,] 0 1 0
[196,] 0 0 1
[197,] 1 0 0
[198,] 1 0 0
[199,] 1 0 0
[200,] 0 1 0
[201,] 0 1 0
[202,] 0 1 0
[203,] 0 1 0
[204,] 0 1 0
[205,] 0 0 1
[206,] 0 0 1
[207,] 0 0 1
[208,] 0 0 1
[209,] 0 1 0
[210,] 0 1 0
[211,] 0 1 0
[212,] 0 0 1
[213,] 0 1 0
[214,] 0 1 0
[215,] 0 1 0
[216,] 0 1 0
[217,] 0 1 0
[218,] 1 0 0
[219,] 0 0 1
[220,] 1 0 0
[221,] 0 0 1
[222,] 0 1 0
[223,] 0 1 0
[224,] 0 1 0
[225,] 0 0 1
[226,] 0 0 1
[227,] 0 1 0
[228,] 1 0 0
[229,] 0 1 0
[230,] 0 0 1
[231,] 0 1 0
[232,] 0 1 0
[233,] 0 0 1
[234,] 0 1 0
[235,] 0 1 0
[236,] 0 1 0
[237,] 0 1 0
[238,] 0 1 0
[239,] 0 0 1
[240,] 0 0 1
[241,] 0 0 1
[242,] 0 0 1
[243,] 1 0 0
[244,] 0 0 1
[245,] 0 1 0
[246,] 0 1 0
[247,] 0 1 0
[248,] 0 0 1
[249,] 0 1 0
[250,] 0 0 1
[251,] 0 0 1
[252,] 0 0 1
[253,] 0 1 0
[254,] 0 1 0
[255,] 0 1 0
[256,] 0 1 0
[257,] 0 1 0
[258,] 0 0 1
[259,] 0 0 1
[260,] 0 1 0
[261,] 0 1 0
[262,] 0 1 0
[263,] 0 0 1
[264,] 0 0 1
[265,] 0 1 0
[266,] 0 1 0
[267,] 0 0 1
[268,] 0 0 1
[269,] 0 0 1
[270,] 0 1 0
[271,] 0 1 0
[272,] 0 1 0
[273,] 0 1 0
[274,] 1 0 0
[275,] 0 1 0
[276,] 1 0 0
[277,] 0 0 1
[278,] 0 0 1
[279,] 0 1 0
[280,] 0 1 0
[281,] 0 0 1
[282,] 0 0 1
[283,] 0 0 1
[284,] 0 0 1
[285,] 1 0 0
[286,] 0 0 1
[287,] 0 1 0
[288,] 0 1 0
[289,] 0 0 1
[290,] 0 1 0
[291,] 1 0 0
[292,] 0 1 0
[293,] 0 0 1
[294,] 0 0 1
[295,] 1 0 0
[296,] 0 1 0
[297,] 0 0 1
[298,] 0 0 1
[299,] 0 0 1
[300,] 0 0 1
[301,] 1 0 0
[302,] 0 0 1
[303,] 0 1 0
[304,] 0 1 0
[305,] 0 1 0
[306,] 0 1 0
[307,] 0 0 1
[308,] 0 1 0
[309,] 0 1 0
[310,] 0 0 1
[311,] 0 1 0
[312,] 0 1 0
[313,] 0 0 1
[314,] 0 1 0
[315,] 0 1 0
[316,] 0 0 1
[317,] 0 1 0
[318,] 1 0 0
[319,] 0 1 0
[320,] 0 1 0
[321,] 0 0 1
[322,] 0 0 1
[323,] 0 1 0
[324,] 0 0 1
[325,] 0 0 1
[326,] 1 0 0
[327,] 0 1 0
[328,] 0 1 0
[329,] 0 1 0
[330,] 0 1 0
[331,] 0 1 0
[332,] 1 0 0
[333,] 0 1 0
[ reached getOption("max.print") -- omitted 4955 rows ]
bank_data$marital <- NULL
bank_data
cbind(bank_data,dummy(bank_data$education, sep = "_"))
education_dummy <- dummy(bank_data$education, sep = "_")
education_dummy
education_primary education_secondary education_tertiary education_unknown
[1,] 0 0 1 0
[2,] 0 1 0 0
[3,] 0 0 1 0
[4,] 0 1 0 0
[5,] 0 0 1 0
[6,] 0 0 0 1
[7,] 0 1 0 0
[8,] 1 0 0 0
[9,] 0 1 0 0
[10,] 0 1 0 0
[11,] 0 1 0 0
[12,] 0 0 1 0
[13,] 1 0 0 0
[14,] 1 0 0 0
[15,] 0 0 1 0
[16,] 0 1 0 0
[17,] 1 0 0 0
[18,] 0 0 1 0
[19,] 0 1 0 0
[20,] 0 1 0 0
[21,] 0 0 1 0
[22,] 0 1 0 0
[23,] 0 1 0 0
[24,] 0 1 0 0
[25,] 0 1 0 0
[26,] 0 1 0 0
[27,] 0 1 0 0
[28,] 1 0 0 0
[29,] 0 0 1 0
[30,] 1 0 0 0
[31,] 0 1 0 0
[32,] 0 0 0 1
[33,] 0 1 0 0
[34,] 0 0 1 0
[35,] 1 0 0 0
[36,] 0 0 1 0
[37,] 0 0 1 0
[38,] 0 0 1 0
[39,] 0 1 0 0
[40,] 0 0 0 1
[41,] 0 1 0 0
[42,] 0 0 1 0
[43,] 0 1 0 0
[44,] 0 1 0 0
[45,] 0 0 1 0
[46,] 0 1 0 0
[47,] 0 1 0 0
[48,] 1 0 0 0
[49,] 0 1 0 0
[50,] 1 0 0 0
[51,] 0 1 0 0
[52,] 0 0 1 0
[53,] 0 1 0 0
[54,] 0 0 1 0
[55,] 0 1 0 0
[56,] 0 1 0 0
[57,] 0 1 0 0
[58,] 0 1 0 0
[59,] 0 0 1 0
[60,] 0 0 1 0
[61,] 1 0 0 0
[62,] 0 0 1 0
[63,] 0 1 0 0
[64,] 0 0 1 0
[65,] 0 0 1 0
[66,] 0 0 1 0
[67,] 1 0 0 0
[68,] 0 0 0 1
[69,] 0 1 0 0
[70,] 0 0 1 0
[71,] 0 1 0 0
[72,] 0 1 0 0
[73,] 0 1 0 0
[74,] 0 0 1 0
[75,] 0 0 1 0
[76,] 0 0 1 0
[77,] 0 1 0 0
[78,] 0 0 1 0
[79,] 0 1 0 0
[80,] 0 1 0 0
[81,] 0 0 1 0
[82,] 0 0 1 0
[83,] 0 0 1 0
[84,] 0 1 0 0
[85,] 0 1 0 0
[86,] 0 0 1 0
[87,] 0 1 0 0
[88,] 1 0 0 0
[89,] 0 1 0 0
[90,] 0 1 0 0
[91,] 0 0 1 0
[92,] 0 0 1 0
[93,] 0 1 0 0
[94,] 0 1 0 0
[95,] 0 0 0 1
[96,] 0 0 1 0
[97,] 0 1 0 0
[98,] 0 1 0 0
[99,] 0 1 0 0
[100,] 0 0 1 0
[101,] 0 0 1 0
[102,] 0 1 0 0
[103,] 0 1 0 0
[104,] 0 0 1 0
[105,] 0 0 1 0
[106,] 0 1 0 0
[107,] 0 1 0 0
[108,] 1 0 0 0
[109,] 0 1 0 0
[110,] 0 0 1 0
[111,] 0 0 1 0
[112,] 0 1 0 0
[113,] 0 1 0 0
[114,] 0 0 1 0
[115,] 0 1 0 0
[116,] 1 0 0 0
[117,] 0 0 1 0
[118,] 0 1 0 0
[119,] 0 1 0 0
[120,] 0 0 1 0
[121,] 0 0 1 0
[122,] 0 1 0 0
[123,] 0 1 0 0
[124,] 0 0 1 0
[125,] 0 0 0 1
[126,] 0 1 0 0
[127,] 0 1 0 0
[128,] 0 1 0 0
[129,] 0 0 1 0
[130,] 1 0 0 0
[131,] 0 0 1 0
[132,] 0 1 0 0
[133,] 0 1 0 0
[134,] 0 0 1 0
[135,] 0 0 1 0
[136,] 0 1 0 0
[137,] 0 1 0 0
[138,] 0 1 0 0
[139,] 0 0 1 0
[140,] 0 1 0 0
[141,] 0 1 0 0
[142,] 0 1 0 0
[143,] 0 0 1 0
[144,] 0 0 1 0
[145,] 0 0 1 0
[146,] 1 0 0 0
[147,] 1 0 0 0
[148,] 0 1 0 0
[149,] 0 1 0 0
[150,] 0 0 1 0
[151,] 0 0 0 1
[152,] 0 1 0 0
[153,] 0 0 1 0
[154,] 0 0 1 0
[155,] 0 0 1 0
[156,] 0 0 1 0
[157,] 0 1 0 0
[158,] 0 0 1 0
[159,] 0 0 1 0
[160,] 0 1 0 0
[161,] 0 0 1 0
[162,] 0 0 0 1
[163,] 0 0 1 0
[164,] 0 1 0 0
[165,] 0 1 0 0
[166,] 0 0 1 0
[167,] 0 1 0 0
[168,] 0 0 1 0
[169,] 0 1 0 0
[170,] 0 1 0 0
[171,] 0 1 0 0
[172,] 1 0 0 0
[173,] 0 0 1 0
[174,] 0 1 0 0
[175,] 0 1 0 0
[176,] 0 1 0 0
[177,] 0 0 1 0
[178,] 0 1 0 0
[179,] 0 1 0 0
[180,] 0 1 0 0
[181,] 0 0 1 0
[182,] 0 0 1 0
[183,] 0 0 1 0
[184,] 0 1 0 0
[185,] 0 1 0 0
[186,] 0 1 0 0
[187,] 0 1 0 0
[188,] 0 0 1 0
[189,] 0 0 1 0
[190,] 0 0 1 0
[191,] 0 1 0 0
[192,] 0 0 1 0
[193,] 0 0 1 0
[194,] 0 0 1 0
[195,] 0 1 0 0
[196,] 0 1 0 0
[197,] 0 0 0 1
[198,] 1 0 0 0
[199,] 0 1 0 0
[200,] 0 0 1 0
[201,] 1 0 0 0
[202,] 0 0 1 0
[203,] 0 1 0 0
[204,] 0 0 1 0
[205,] 0 0 1 0
[206,] 0 0 1 0
[207,] 0 0 1 0
[208,] 0 0 0 1
[209,] 0 0 1 0
[210,] 0 1 0 0
[211,] 0 0 0 1
[212,] 0 1 0 0
[213,] 0 1 0 0
[214,] 0 0 1 0
[215,] 0 0 1 0
[216,] 0 0 1 0
[217,] 0 0 1 0
[218,] 0 1 0 0
[219,] 0 0 1 0
[220,] 0 1 0 0
[221,] 0 1 0 0
[222,] 1 0 0 0
[223,] 0 1 0 0
[224,] 0 1 0 0
[225,] 0 1 0 0
[226,] 0 1 0 0
[227,] 0 1 0 0
[228,] 0 1 0 0
[229,] 0 0 1 0
[230,] 0 1 0 0
[231,] 0 0 1 0
[232,] 0 0 1 0
[233,] 0 0 1 0
[234,] 0 0 0 1
[235,] 0 0 1 0
[236,] 0 0 1 0
[237,] 1 0 0 0
[238,] 0 1 0 0
[239,] 0 0 1 0
[240,] 0 0 1 0
[241,] 0 0 1 0
[242,] 0 0 1 0
[243,] 0 1 0 0
[244,] 0 0 1 0
[245,] 0 0 1 0
[246,] 0 1 0 0
[247,] 1 0 0 0
[248,] 0 1 0 0
[249,] 0 1 0 0
[250,] 0 1 0 0
[ reached getOption("max.print") -- omitted 5038 rows ]
bank_data$education <- NULL
bank_data
cbind(bank_data,dummy(bank_data$contact, sep = "_"))
education_dummy <- dummy(bank_data$contact, sep = "_")
education_dummy
contact_cellular contact_telephone contact_unknown
[1,] 1 0 0
[2,] 1 0 0
[3,] 1 0 0
[4,] 1 0 0
[5,] 1 0 0
[6,] 1 0 0
[7,] 1 0 0
[8,] 0 1 0
[9,] 1 0 0
[10,] 0 0 1
[11,] 1 0 0
[12,] 1 0 0
[13,] 1 0 0
[14,] 1 0 0
[15,] 1 0 0
[16,] 1 0 0
[17,] 1 0 0
[18,] 1 0 0
[19,] 1 0 0
[20,] 1 0 0
[21,] 1 0 0
[22,] 1 0 0
[23,] 1 0 0
[24,] 1 0 0
[25,] 1 0 0
[26,] 1 0 0
[27,] 0 0 1
[28,] 1 0 0
[29,] 1 0 0
[30,] 1 0 0
[31,] 1 0 0
[32,] 0 1 0
[33,] 1 0 0
[34,] 1 0 0
[35,] 1 0 0
[36,] 1 0 0
[37,] 0 0 1
[38,] 1 0 0
[39,] 1 0 0
[40,] 1 0 0
[41,] 0 1 0
[42,] 0 1 0
[43,] 1 0 0
[44,] 1 0 0
[45,] 1 0 0
[46,] 1 0 0
[47,] 0 0 1
[48,] 1 0 0
[49,] 1 0 0
[50,] 0 1 0
[51,] 1 0 0
[52,] 1 0 0
[53,] 1 0 0
[54,] 1 0 0
[55,] 0 0 1
[56,] 1 0 0
[57,] 1 0 0
[58,] 1 0 0
[59,] 1 0 0
[60,] 1 0 0
[61,] 0 1 0
[62,] 1 0 0
[63,] 1 0 0
[64,] 1 0 0
[65,] 1 0 0
[66,] 1 0 0
[67,] 1 0 0
[68,] 1 0 0
[69,] 0 0 1
[70,] 1 0 0
[71,] 1 0 0
[72,] 1 0 0
[73,] 1 0 0
[74,] 1 0 0
[75,] 1 0 0
[76,] 1 0 0
[77,] 1 0 0
[78,] 1 0 0
[79,] 1 0 0
[80,] 1 0 0
[81,] 1 0 0
[82,] 1 0 0
[83,] 1 0 0
[84,] 1 0 0
[85,] 1 0 0
[86,] 1 0 0
[87,] 1 0 0
[88,] 1 0 0
[89,] 1 0 0
[90,] 1 0 0
[91,] 1 0 0
[92,] 0 0 1
[93,] 1 0 0
[94,] 1 0 0
[95,] 1 0 0
[96,] 1 0 0
[97,] 1 0 0
[98,] 1 0 0
[99,] 1 0 0
[100,] 1 0 0
[101,] 1 0 0
[102,] 1 0 0
[103,] 1 0 0
[104,] 1 0 0
[105,] 1 0 0
[106,] 1 0 0
[107,] 1 0 0
[108,] 1 0 0
[109,] 1 0 0
[110,] 1 0 0
[111,] 1 0 0
[112,] 1 0 0
[113,] 1 0 0
[114,] 1 0 0
[115,] 1 0 0
[116,] 0 0 1
[117,] 1 0 0
[118,] 1 0 0
[119,] 1 0 0
[120,] 1 0 0
[121,] 1 0 0
[122,] 1 0 0
[123,] 1 0 0
[124,] 1 0 0
[125,] 1 0 0
[126,] 1 0 0
[127,] 1 0 0
[128,] 1 0 0
[129,] 0 1 0
[130,] 1 0 0
[131,] 1 0 0
[132,] 0 0 1
[133,] 0 0 1
[134,] 1 0 0
[135,] 1 0 0
[136,] 1 0 0
[137,] 1 0 0
[138,] 1 0 0
[139,] 1 0 0
[140,] 1 0 0
[141,] 1 0 0
[142,] 0 1 0
[143,] 1 0 0
[144,] 1 0 0
[145,] 1 0 0
[146,] 1 0 0
[147,] 1 0 0
[148,] 1 0 0
[149,] 1 0 0
[150,] 1 0 0
[151,] 1 0 0
[152,] 1 0 0
[153,] 1 0 0
[154,] 1 0 0
[155,] 1 0 0
[156,] 1 0 0
[157,] 1 0 0
[158,] 0 0 1
[159,] 1 0 0
[160,] 1 0 0
[161,] 1 0 0
[162,] 1 0 0
[163,] 1 0 0
[164,] 1 0 0
[165,] 1 0 0
[166,] 1 0 0
[167,] 1 0 0
[168,] 1 0 0
[169,] 1 0 0
[170,] 1 0 0
[171,] 1 0 0
[172,] 1 0 0
[173,] 1 0 0
[174,] 1 0 0
[175,] 1 0 0
[176,] 1 0 0
[177,] 1 0 0
[178,] 1 0 0
[179,] 1 0 0
[180,] 1 0 0
[181,] 1 0 0
[182,] 1 0 0
[183,] 1 0 0
[184,] 1 0 0
[185,] 1 0 0
[186,] 1 0 0
[187,] 1 0 0
[188,] 1 0 0
[189,] 1 0 0
[190,] 1 0 0
[191,] 1 0 0
[192,] 1 0 0
[193,] 0 1 0
[194,] 1 0 0
[195,] 1 0 0
[196,] 1 0 0
[197,] 1 0 0
[198,] 1 0 0
[199,] 1 0 0
[200,] 1 0 0
[201,] 1 0 0
[202,] 1 0 0
[203,] 1 0 0
[204,] 1 0 0
[205,] 1 0 0
[206,] 1 0 0
[207,] 1 0 0
[208,] 1 0 0
[209,] 1 0 0
[210,] 1 0 0
[211,] 1 0 0
[212,] 1 0 0
[213,] 0 1 0
[214,] 1 0 0
[215,] 1 0 0
[216,] 1 0 0
[217,] 1 0 0
[218,] 1 0 0
[219,] 1 0 0
[220,] 1 0 0
[221,] 1 0 0
[222,] 1 0 0
[223,] 0 1 0
[224,] 0 1 0
[225,] 1 0 0
[226,] 1 0 0
[227,] 1 0 0
[228,] 0 0 1
[229,] 1 0 0
[230,] 0 0 1
[231,] 1 0 0
[232,] 1 0 0
[233,] 1 0 0
[234,] 1 0 0
[235,] 1 0 0
[236,] 1 0 0
[237,] 1 0 0
[238,] 1 0 0
[239,] 1 0 0
[240,] 1 0 0
[241,] 1 0 0
[242,] 1 0 0
[243,] 0 0 1
[244,] 1 0 0
[245,] 1 0 0
[246,] 1 0 0
[247,] 1 0 0
[248,] 1 0 0
[249,] 1 0 0
[250,] 1 0 0
[251,] 1 0 0
[252,] 1 0 0
[253,] 1 0 0
[254,] 1 0 0
[255,] 1 0 0
[256,] 1 0 0
[257,] 1 0 0
[258,] 1 0 0
[259,] 1 0 0
[260,] 1 0 0
[261,] 1 0 0
[262,] 1 0 0
[263,] 1 0 0
[264,] 0 1 0
[265,] 0 1 0
[266,] 1 0 0
[267,] 0 0 1
[268,] 1 0 0
[269,] 1 0 0
[270,] 1 0 0
[271,] 0 0 1
[272,] 1 0 0
[273,] 1 0 0
[274,] 1 0 0
[275,] 0 1 0
[276,] 1 0 0
[277,] 1 0 0
[278,] 1 0 0
[279,] 1 0 0
[280,] 1 0 0
[281,] 0 0 1
[282,] 1 0 0
[283,] 1 0 0
[284,] 1 0 0
[285,] 0 0 1
[286,] 1 0 0
[287,] 1 0 0
[288,] 1 0 0
[289,] 1 0 0
[290,] 1 0 0
[291,] 0 1 0
[292,] 0 1 0
[293,] 1 0 0
[294,] 1 0 0
[295,] 1 0 0
[296,] 1 0 0
[297,] 1 0 0
[298,] 1 0 0
[299,] 0 0 1
[300,] 1 0 0
[301,] 1 0 0
[302,] 1 0 0
[303,] 1 0 0
[304,] 1 0 0
[305,] 1 0 0
[306,] 1 0 0
[307,] 1 0 0
[308,] 1 0 0
[309,] 0 0 1
[310,] 1 0 0
[311,] 1 0 0
[312,] 1 0 0
[313,] 1 0 0
[314,] 0 1 0
[315,] 1 0 0
[316,] 1 0 0
[317,] 0 0 1
[318,] 1 0 0
[319,] 0 1 0
[320,] 0 1 0
[321,] 1 0 0
[322,] 1 0 0
[323,] 1 0 0
[324,] 1 0 0
[325,] 1 0 0
[326,] 0 1 0
[327,] 1 0 0
[328,] 1 0 0
[329,] 1 0 0
[330,] 0 1 0
[331,] 1 0 0
[332,] 1 0 0
[333,] 1 0 0
[ reached getOption("max.print") -- omitted 4955 rows ]
bank_data$contact <- NULL
bank_data
cbind(bank_data,dummy(bank_data$month, sep = "_"))
month_dummy <- dummy(bank_data$month, sep = "_")
month_dummy
month_apr month_aug month_dec month_feb month_jan month_jul month_jun month_mar
[1,] 0 0 0 0 1 0 0 0
[2,] 1 0 0 0 0 0 0 0
[3,] 0 0 0 0 0 0 1 0
[4,] 0 0 0 0 0 1 0 0
[5,] 0 1 0 0 0 0 0 0
[6,] 0 0 0 0 0 1 0 0
[7,] 0 1 0 0 0 0 0 0
[8,] 0 0 0 0 0 0 0 0
[9,] 0 1 0 0 0 0 0 0
[10,] 0 0 0 0 0 0 1 0
[11,] 0 1 0 0 0 0 0 0
[12,] 0 1 0 0 0 0 0 0
[13,] 0 0 0 0 0 0 0 0
[14,] 1 0 0 0 0 0 0 0
[15,] 0 0 0 1 0 0 0 0
[16,] 0 0 0 0 0 0 0 0
[17,] 0 0 0 0 0 0 0 0
[18,] 0 0 0 0 0 0 0 0
[19,] 1 0 0 0 0 0 0 0
[20,] 0 0 0 1 0 0 0 0
[21,] 0 1 0 0 0 0 0 0
[22,] 0 1 0 0 0 0 0 0
[23,] 0 0 0 0 0 0 0 0
[24,] 0 0 0 0 0 1 0 0
[25,] 0 0 0 0 1 0 0 0
[26,] 1 0 0 0 0 0 0 0
[27,] 0 0 0 0 0 0 0 0
[28,] 1 0 0 0 0 0 0 0
[29,] 0 0 0 0 0 0 0 0
[30,] 0 0 0 0 0 0 0 1
[31,] 0 0 0 0 0 0 0 0
[32,] 0 1 0 0 0 0 0 0
[33,] 0 0 0 0 0 0 0 0
[34,] 0 1 0 0 0 0 0 0
[35,] 0 0 0 0 0 0 0 1
[36,] 0 0 0 0 0 0 0 0
[37,] 0 0 0 1 0 0 0 0
[38,] 0 0 0 0 0 1 0 0
[39,] 0 0 0 0 0 0 0 0
[40,] 1 0 0 0 0 0 0 0
[41,] 0 0 0 0 0 0 0 1
[42,] 0 0 0 0 0 0 0 0
[43,] 0 0 0 1 0 0 0 0
[44,] 0 0 0 0 0 0 1 0
[45,] 0 0 0 0 0 0 0 0
[46,] 0 0 0 1 0 0 0 0
[47,] 0 0 0 0 0 0 0 0
[48,] 0 0 0 0 0 0 0 0
[49,] 0 0 1 0 0 0 0 0
[50,] 0 0 0 0 0 0 0 0
[51,] 0 0 0 0 0 0 0 0
[52,] 0 0 0 0 0 0 0 0
[53,] 0 0 0 0 0 0 0 0
[54,] 0 0 0 0 0 0 0 0
[55,] 0 0 0 0 0 0 1 0
[56,] 0 0 0 0 0 1 0 0
[57,] 0 0 0 1 0 0 0 0
[58,] 0 0 0 0 0 0 0 0
[59,] 0 0 0 0 0 0 0 0
[60,] 1 0 0 0 0 0 0 0
[61,] 0 0 0 0 0 1 0 0
[62,] 0 0 0 1 0 0 0 0
[63,] 0 0 0 0 0 0 0 0
[64,] 0 0 0 0 0 0 0 1
[65,] 0 0 0 0 0 0 0 0
[66,] 0 0 0 0 0 0 0 0
[67,] 0 0 0 0 0 0 0 0
[68,] 0 0 0 0 1 0 0 0
[69,] 0 0 0 0 0 0 1 0
[70,] 0 0 0 0 0 0 0 0
[71,] 0 0 0 0 0 0 0 1
[72,] 1 0 0 0 0 0 0 0
[73,] 0 0 0 0 0 0 0 1
[74,] 0 0 0 0 0 0 0 1
[75,] 0 0 0 0 0 0 0 0
[76,] 0 0 0 0 0 0 1 0
[77,] 0 0 0 0 0 0 0 0
[78,] 0 0 0 0 0 0 0 0
[79,] 0 0 0 1 0 0 0 0
[80,] 0 1 0 0 0 0 0 0
[81,] 1 0 0 0 0 0 0 0
[82,] 0 0 0 0 0 0 0 0
[83,] 1 0 0 0 0 0 0 0
month_may month_nov month_oct month_sep
[1,] 0 0 0 0
[2,] 0 0 0 0
[3,] 0 0 0 0
[4,] 0 0 0 0
[5,] 0 0 0 0
[6,] 0 0 0 0
[7,] 0 0 0 0
[8,] 0 0 0 1
[9,] 0 0 0 0
[10,] 0 0 0 0
[11,] 0 0 0 0
[12,] 0 0 0 0
[13,] 0 0 1 0
[14,] 0 0 0 0
[15,] 0 0 0 0
[16,] 1 0 0 0
[17,] 0 1 0 0
[18,] 0 0 1 0
[19,] 0 0 0 0
[20,] 0 0 0 0
[21,] 0 0 0 0
[22,] 0 0 0 0
[23,] 0 0 0 1
[24,] 0 0 0 0
[25,] 0 0 0 0
[26,] 0 0 0 0
[27,] 1 0 0 0
[28,] 0 0 0 0
[29,] 1 0 0 0
[30,] 0 0 0 0
[31,] 1 0 0 0
[32,] 0 0 0 0
[33,] 1 0 0 0
[34,] 0 0 0 0
[35,] 0 0 0 0
[36,] 0 0 1 0
[37,] 0 0 0 0
[38,] 0 0 0 0
[39,] 0 1 0 0
[40,] 0 0 0 0
[41,] 0 0 0 0
[42,] 0 0 0 1
[43,] 0 0 0 0
[44,] 0 0 0 0
[45,] 1 0 0 0
[46,] 0 0 0 0
[47,] 1 0 0 0
[48,] 1 0 0 0
[49,] 0 0 0 0
[50,] 0 0 1 0
[51,] 1 0 0 0
[52,] 1 0 0 0
[53,] 1 0 0 0
[54,] 0 1 0 0
[55,] 0 0 0 0
[56,] 0 0 0 0
[57,] 0 0 0 0
[58,] 1 0 0 0
[59,] 0 0 0 1
[60,] 0 0 0 0
[61,] 0 0 0 0
[62,] 0 0 0 0
[63,] 1 0 0 0
[64,] 0 0 0 0
[65,] 0 0 0 1
[66,] 0 1 0 0
[67,] 1 0 0 0
[68,] 0 0 0 0
[69,] 0 0 0 0
[70,] 1 0 0 0
[71,] 0 0 0 0
[72,] 0 0 0 0
[73,] 0 0 0 0
[74,] 0 0 0 0
[75,] 1 0 0 0
[76,] 0 0 0 0
[77,] 1 0 0 0
[78,] 0 1 0 0
[79,] 0 0 0 0
[80,] 0 0 0 0
[81,] 0 0 0 0
[82,] 0 1 0 0
[83,] 0 0 0 0
[ reached getOption("max.print") -- omitted 5205 rows ]
bank_data$month <- NULL
bank_data
bank_data$default <- as.numeric(as.character(factor(bank_data$default,levels=c('yes','no'),
labels =c(1,0) )))
bank_data$default
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[45] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[89] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[133] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[177] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[221] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[265] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[309] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[353] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[397] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[441] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[485] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[529] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[573] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[617] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[661] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[705] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[749] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[793] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[837] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[881] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[925] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[969] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[ reached getOption("max.print") -- omitted 4288 entries ]
bank_data$housing <- as.numeric(as.character(factor(bank_data$housing,levels=c('yes','no'),
labels =c(1,0) )))
bank_data$housing
[1] 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 1 1 0 1 0 1 1 0 0 0 1 0 0 0 0 0 1
[45] 0 1 1 1 0 0 1 0 1 0 1 1 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0 1 1
[89] 1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 1 0 0 0 1 0 1 0 1 1 0 0 0 1 0
[133] 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0
[177] 0 0 0 1 1 0 1 1 1 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 1 1 0 1 0 1 0 0 1 0 1 0 0 0 0 1 0 1
[221] 1 1 0 1 0 1 0 1 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 0
[265] 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 1 1 1 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 0 0 1
[309] 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1
[353] 0 0 1 0 0 0 1 1 0 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 1
[397] 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0
[441] 1 0 1 0 0 1 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 1 0 0 1 0 0 1 1 0 1 1 0
[485] 0 1 0 1 1 1 1 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
[529] 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 0
[573] 1 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1
[617] 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0
[661] 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0
[705] 0 1 1 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 1 1 1 1 1 1 1 0 0 0 0
[749] 1 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1
[793] 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0
[837] 1 0 0 1 1 0 1 1 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0
[881] 1 1 0 0 1 0 0 0 1 1 1 1 0 1 0 1 1 1 1 1 0 0 1 0 1 1 0 1 0 1 0 0 0 1 1 1 1 1 0 0 0 1 0 1
[925] 0 0 0 1 1 1 1 0 0 1 1 1 0 1 0 1 1 0 1 0 0 1 1 0 1 1 1 0 0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 1
[969] 1 0 0 1 1 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 1 0 1
[ reached getOption("max.print") -- omitted 4288 entries ]
bank_data$loan <- as.numeric(as.character(factor(bank_data$loan,levels = c('yes','no'),
labels =c(1,0) )))
bank_data$loan
[1] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
[45] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
[89] 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[133] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[177] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
[221] 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0
[265] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1
[309] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0
[353] 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0
[397] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[441] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[485] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0
[529] 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
[573] 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
[617] 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[661] 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
[705] 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[749] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1
[793] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[837] 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
[881] 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[925] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
[969] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
[ reached getOption("max.print") -- omitted 4288 entries ]
#bank_data$y <- as.numeric(as.character(factor(bank_data$y,levels = c('yes','no'),
# labels =c(1,0))))
#bank_data$y
View(bank_data)
bank_dataset <- cbind(job_dummy,marital_dummy,education_dummy,month_dummy,pouctome_dummy,(bank_data))
View(bank_dataset)
normalize_data <- bank_dataset
norm.values <- preProcess(normalize_data[,1:41], method = c("center", "scale"))
normalize_data[,1:41] <- predict(norm.values, normalize_data[,1:44])
Error in `[.data.frame`(normalize_data, , 1:44) :
undefined columns selected
set.seed(1)
idx <- sample(seq(1, 3), size = nrow(normalize_data), replace = TRUE, prob = c(.7, .2, .1))
train.df <- normalize_data[idx == 1,]
valid.df <- normalize_data[idx == 2,]
test.df <- normalize_data[idx == 3,]
nrow(train.df)
[1] 3675
nrow(valid.df)
[1] 1052
nrow(test.df)
[1] 561
install.packages(“ltm”)
biserial.cor(bank.full$age, bank.full$y, use = c("all.obs", "complete.obs"), level = 1)
[1] -0.02515502
cor_data <- train.df[c('age','balance','day','duration','campaign','pdays','previous')]
cor(cor_data)
age balance day duration campaign pdays
age 1.0000000000 0.112284427 -0.009183551 0.0005928877 -0.008332416 0.007017854
balance 0.1122844272 1.000000000 -0.014891864 0.0072597918 -0.009986158 0.017135883
day -0.0091835506 -0.014891864 1.000000000 -0.0176419511 0.118191369 -0.063199263
duration 0.0005928877 0.007259792 -0.017641951 1.0000000000 -0.022193163 -0.058779389
campaign -0.0083324157 -0.009986158 0.118191369 -0.0221931628 1.000000000 -0.095514860
pdays 0.0070178536 0.017135883 -0.063199263 -0.0587793889 -0.095514860 1.000000000
previous 0.0253065448 0.046640417 -0.048067064 -0.0295119004 -0.034617033 0.501037123
previous
age 0.02530654
balance 0.04664042
day -0.04806706
duration -0.02951190
campaign -0.03461703
pdays 0.50103712
previous 1.00000000
par(mfrow=c(2,2))
boxplot(train.df$age ~ train.df$y,main="Age", col=c('powderblue', 'mistyrose'))
boxplot(train.df$balance ~ train.df$y,main="Balance", col=c('powderblue', 'mistyrose'))
boxplot(train.df$day ~ train.df$y,main="day", col=c('powderblue', 'mistyrose'))
boxplot(train.df$duration ~ train.df$y,main="duration", col=c('powderblue', 'mistyrose'))
boxplot(train.df$campaign ~ train.df$y,main="campaign", col=c('powderblue', 'mistyrose'))
boxplot(train.df$pdays ~ train.df$y,main="pdays", col=c('powderblue', 'mistyrose'))
boxplot(train.df$previous ~ train.df$y,main="previous", col=c('powderblue', 'mistyrose'))
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
marital_duration<-summarise(group_by(bank.full,marital),duration=mean(duration))
marital_duration
p1<-ggplot(marital_duration,aes(x=marital,y=duration,fill=marital))+
geom_bar(stat='identity')
marital_balance<-summarise(group_by(bank.full,marital),balance=mean(balance))
marital_balance
p2<-ggplot(marital_balance,aes(x=marital,y=balance,fill=marital))+ geom_bar(stat='identity')
marital_age<-summarise(group_by(bank.full,marital),age=mean(age))
marital_age
p3<-ggplot(marital_age,aes(x=marital,y=age,fill=marital))+ geom_bar(stat='identity')
marital_pdays<-summarise(group_by(bank.full,marital),pdays=mean(pdays))
marital_pdays
p4<-ggplot(marital_pdays,aes(x=marital,y=pdays,fill=marital))+ geom_bar(stat='identity')
multiplot(p1, p2, p3, p4, cols=2)
job_duration<-summarise(group_by(bank.full,job),duration=mean(duration))
job_duration
p1<-ggplot(job_duration,aes(x=job,y=duration,fill=job))+
geom_bar(stat='identity')+theme(axis.text.x = element_text(angle = 45,
hjust = 1, vjust = 0.5))
job_balance<-summarise(group_by(bank.full,job),balance=mean(balance))
job_balance
p2<-ggplot(job_balance,aes(x=job,y=balance,fill=job))+
geom_bar(stat='identity')+theme(axis.text.x = element_text(angle = 45,
hjust = 1, vjust = 0.5))
job_age<-summarise(group_by(bank.full,job),age=mean(age))
job_age
p3<-ggplot(job_age,aes(x=job,y=age,fill=job))+
geom_bar(stat='identity')+theme(axis.text.x = element_text(angle = 45,
hjust = 1, vjust = 0.5))
job_pdays<-summarise(group_by(bank.full,job),pdays=mean(pdays))
job_pdays
p4<-ggplot(job_pdays,aes(x=job,y=pdays,fill=job))+
geom_bar(stat='identity')+theme(axis.text.x = element_text(angle = 45,
hjust = 1, vjust = 0.5))
multiplot(p1, p2, p3, p4, cols=2)
education_duration<-summarise(group_by(bank.full,education),duration=mean(duration))
education_duration
p1<-ggplot(education_duration,aes(x=education,y=duration,fill=education))+
geom_bar(stat='identity')
education_balance<-summarise(group_by(bank.full,education),balance=mean(balance))
education_balance
p2<-ggplot(education_balance,aes(x=education,y=balance,fill=education))+
geom_bar(stat='identity')
education_age<-summarise(group_by(bank.full,education),age=mean(age))
education_age
p3<-ggplot(education_age,aes(x=education,y=age,fill=education))+ geom_bar(stat='identity')
education_pdays<-summarise(group_by(bank.full,education),age=mean(pdays))
education_pdays
p4<-ggplot(education_pdays,aes(x=education,y=age,fill=education))+ geom_bar(stat='identity')
multiplot(p1, p2, p3, p4, cols=2)
ggplot(bank.full,aes(x=education,fill=education))+ geom_bar(stat='count',aes(fill =
factor(y)),position = position_dodge(width = 0.9))
ggplot(bank.full,aes(x=marital,fill=marital))+ geom_bar(stat='count',aes(fill =
factor(y)),position = position_dodge(width = 0.9))
ggplot(bank.full,aes(x=job,fill=job))+ geom_bar(stat='count',aes(fill =
factor(y)),position = position_dodge(width = 0.9))+theme(axis.text.x =
element_text(angle = 45, hjust = 1, vjust = 0.5))
ggplot(bank.full,aes(x=contact,fill=contact))+ geom_bar(stat='count',aes(fill =
factor(y)),position = position_dodge(width = 0.9))
ggplot(bank.full,aes(x=contact,fill=contact))+ geom_bar(stat='count',aes(fill =
factor(age)),position = position_dodge(width = 0.9))
par(mfrow=c(2,2))
plot(log(train.df$age), log(train.df$balance), main = "Age Vs Balance", xlab = "Age", ylab = "Balance", col = 2)
NaNs producedNaNs produced
abline(lm(log(train.df$balance) ~ log(train.df$age)))
NaNs producedNaNs produced
plot(log(train.df$age), log(train.df$duration), main = "Age Vs Duration", xlab = "Age", ylab = "Duration", col = 2)
NaNs producedNaNs produced
abline(lm(log(train.df$duration) ~ log(train.df$age)))
NaNs producedNaNs produced
plot(log(train.df$age), log(train.df$pdays), main = "Age Vs Days Past", xlab = "Age", ylab = "Days Past", col = 2)
NaNs producedNaNs produced
abline(lm(log(train.df$pdays) ~ log(train.df$age)))
NaNs producedNaNs produced
plot(log(train.df$age), log(train.df$previous), main = "Age Vs Previously Contacted", xlab = "Age", ylab = "Previously Contacted", col = 2)
NaNs producedNaNs produced
abline(lm(log(train.df$previous) ~ log(train.df$age)))
NaNs producedNaNs produced
plot(log(train.df$age), log(train.df$day), main = "Age Vs Day", xlab = "Age", ylab = "Day", col = 2)
NaNs producedNaNs produced
abline(lm(log(train.df$day) ~ log(train.df$age)))
NaNs producedNaNs produced
plot(log(train.df$age), log(train.df$campaign), main = "Age Vs Campaign", xlab = "Age", ylab = "Campaign", col = 2)
NaNs producedNaNs produced
abline(lm(log(train.df$campaign) ~ log(train.df$age)))
NaNs producedNaNs produced
par(mfrow=c(2,2))
plot(log(train.df$balance), log(train.df$duration), main = "Duration Vs Balance", xlab = "Balance", ylab = "Duration", col = 2)
NaNs producedNaNs produced
abline(lm(log(train.df$duration) ~ log(train.df$balance)))
NaNs producedNaNs produced
plot(log(train.df$duration), log(train.df$pdays), main = "Duration Vs Days Past", xlab = "Duration", ylab = "Days Past", col = 2)
NaNs producedNaNs produced
abline(lm(log(train.df$pdays) ~ log(train.df$duration)))
NaNs producedNaNs produced
plot(log(train.df$duration), log(train.df$previous), main = "Duration Vs Previously Contacted", xlab = "Duration", ylab = "Previously Contacted", col = 2)
NaNs producedNaNs produced
abline(lm(log(train.df$previous) ~ log(train.df$duration)))
NaNs producedNaNs produced
plot(log(train.df$duration), log(train.df$day), main = "Duration Vs Day", xlab = "Duration", ylab = "Day", col = 2)
NaNs producedNaNs produced
abline(lm(log(train.df$day) ~ log(train.df$duration)))
NaNs producedNaNs produced
plot(log(train.df$duration), log(train.df$campaign), main = "Duration Vs Campaign", xlab = "Duration", ylab = "Campaign", col = 2)
NaNs producedNaNs produced
abline(lm(log(train.df$campaign) ~ log(train.df$duration)))
NaNs producedNaNs produced
install.packages(“readxl”) install.packages(‘psych’)
fa.parallel(normal_data, fm="pa", main = "Scree Plot With Parallel Analysis")
Parallel analysis suggests that the number of factors = 3 and the number of components = 3
pc <- principal(r = normal_data, nfactor = 3, rotate = "none")
pc
Principal Components Analysis
Call: principal(r = normal_data, nfactors = 3, rotate = "none")
Standardized loadings (pattern matrix) based upon correlation matrix
PC1 PC2 PC3
SS loadings 1.55 1.12 1.10
Proportion Var 0.22 0.16 0.16
Cumulative Var 0.22 0.38 0.54
Proportion Explained 0.41 0.30 0.29
Cumulative Proportion 0.41 0.71 1.00
Mean item complexity = 1.8
Test of the hypothesis that 3 components are sufficient.
The root mean square of the residuals (RMSR) is 0.16
with the empirical chi square 3852.01 with prob < 0
Fit based upon off diagonal values = -0.73
pc_rotate
Principal Components Analysis
Call: principal(r = cont_data, nfactors = 3, rotate = "varimax")
Standardized loadings (pattern matrix) based upon correlation matrix
RC1 RC2 RC3
SS loadings 1.47 1.20 1.10
Proportion Var 0.21 0.17 0.16
Cumulative Var 0.21 0.38 0.54
Proportion Explained 0.39 0.32 0.29
Cumulative Proportion 0.39 0.71 1.00
Mean item complexity = 1
Test of the hypothesis that 3 components are sufficient.
The root mean square of the residuals (RMSR) is 0.16
with the empirical chi square 48046.71 with prob < 0
Fit based upon off diagonal values = -0.95
pc_score <- principal(normal_data, nfactor = 3, scores = TRUE)
head(pc_score$scores)
RC1 RC3 RC2
27766 -0.8019654 -0.3078275 -0.8798434
32554 -0.5098935 -0.5152472 -0.6841798
40346 1.8285781 0.1695993 -0.3515448
22128 -0.5807649 0.4997158 0.6121206
41611 0.3263004 -0.1517610 2.1938552
41205 0.4302717 0.4789325 -0.5068165
logit_model <- glm(y ~ .,data = train.df, family = binomial(link = "logit"))
summary(logit_model)
Call:
glm(formula = y ~ ., family = binomial(link = "logit"), data = train.df)
Deviance Residuals:
Min 1Q Median 3Q Max
-5.1113 -0.5643 0.0026 0.6066 2.9976
Coefficients: (4 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.148997 0.048483 3.073 0.00212 **
job_admin. -0.063900 0.176641 -0.362 0.71754
`job_blue-collar` -0.284535 0.210942 -1.349 0.17738
job_entrepreneur -0.050385 0.099942 -0.504 0.61416
job_housemaid -0.109636 0.096304 -1.138 0.25494
job_management -0.103918 0.225294 -0.461 0.64462
job_retired 0.089117 0.147935 0.602 0.54690
`job_self-employed` -0.099692 0.111503 -0.894 0.37128
job_services -0.188692 0.155252 -1.215 0.22422
job_student 0.042809 0.107152 0.400 0.68951
job_technician -0.188238 0.202466 -0.930 0.35251
job_unemployed -0.092510 0.101577 -0.911 0.36243
job_unknown NA NA NA NA
marital_divorced 0.074644 0.055456 1.346 0.17830
marital_married -0.093141 0.059002 -1.579 0.11443
marital_single NA NA NA NA
contact_cellular 0.569848 0.078767 7.235 4.67e-13 ***
contact_telephone 0.258543 0.063717 4.058 4.96e-05 ***
contact_unknown NA NA NA NA
month_apr -0.197775 0.099804 -1.982 0.04752 *
month_aug -0.736551 0.120874 -6.094 1.10e-09 ***
month_dec -0.075213 0.069968 -1.075 0.28239
month_feb -0.234373 0.093597 -2.504 0.01228 *
month_jan -0.382777 0.071551 -5.350 8.81e-08 ***
month_jul -0.767311 0.125853 -6.097 1.08e-09 ***
month_jun -0.370021 0.116247 -3.183 0.00146 **
month_mar 0.133735 0.074522 1.795 0.07272 .
month_may -0.874345 0.153207 -5.707 1.15e-08 ***
month_nov -0.453234 0.097923 -4.628 3.68e-06 ***
month_oct 0.027795 0.082265 0.338 0.73546
month_sep NA NA NA NA
age -0.118552 0.067598 -1.754 0.07947 .
default -0.008364 0.046032 -0.182 0.85581
balance 0.050770 0.043568 1.165 0.24389
housing -0.427248 0.054924 -7.779 7.32e-15 ***
loan -0.223600 0.049575 -4.510 6.47e-06 ***
day 0.043336 0.052518 0.825 0.40927
duration 2.297755 0.084221 27.282 < 2e-16 ***
campaign -0.126758 0.056745 -2.234 0.02550 *
pdays 0.028967 0.081983 0.353 0.72384
previous 0.037422 0.057749 0.648 0.51698
poutcome -0.238581 0.085005 -2.807 0.00501 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5094.5 on 3674 degrees of freedom
Residual deviance: 2913.4 on 3637 degrees of freedom
AIC: 2989.4
Number of Fisher Scoring iterations: 6
exp(coef(logit_model))
(Intercept) job_admin. `job_blue-collar` job_entrepreneur
1.1606695 0.9380984 0.7523638 0.9508632
job_housemaid job_management job_retired `job_self-employed`
0.8961602 0.9012989 1.0932085 0.9051162
job_services job_student job_technician job_unemployed
0.8280418 1.0437386 0.8284172 0.9116401
job_unknown marital_divorced marital_married marital_single
NA 1.0775007 0.9110653 NA
contact_cellular contact_telephone contact_unknown month_apr
1.7679981 1.2950423 NA 0.8205544
month_aug month_dec month_feb month_jan
0.4787624 0.9275457 0.7910669 0.6819651
month_jul month_jun month_mar month_may
0.4642599 0.6907195 1.1430893 0.4171354
month_nov month_oct month_sep age
0.6355692 1.0281852 NA 0.8882053
default balance housing loan
0.9916705 1.0520812 0.6523019 0.7996350
day duration campaign pdays
1.0442892 9.9518131 0.8809472 1.0293906
previous poutcome
1.0381311 0.7877449
confusionMatrix(as.factor(result), valid.df$y)
Confusion Matrix and Statistics
Reference
Prediction no yes
no 478 197
yes 54 323
Accuracy : 0.7614
95% CI : (0.7345, 0.7869)
No Information Rate : 0.5057
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5213
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.8985
Specificity : 0.6212
Pos Pred Value : 0.7081
Neg Pred Value : 0.8568
Prevalence : 0.5057
Detection Rate : 0.4544
Detection Prevalence : 0.6416
Balanced Accuracy : 0.7598
'Positive' Class : no
testing_data <- data.frame(train.df[,c(16,17,20,23,24,27,28,34,35,37,41,42)])
testing_data
testing_valid <- data.frame(valid.df[,c(16,17,20,23,24,27,28,34,35,37,41,42)])
testing_valid
ds_train <- data.frame(train.df[,c(34,35,37,41,42)])
ds_train
ds_valid <- data.frame(valid.df[,c(34,35,37,41,42)])
ds_valid
ds_test <- data.frame(test.df[,c(34,35,37,41,42)])
ds_test
confusionMatrix(as.factor(result), valid.df$y)
Confusion Matrix and Statistics
Reference
Prediction no yes
no 468 193
yes 64 327
Accuracy : 0.7557
95% CI : (0.7286, 0.7814)
No Information Rate : 0.5057
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.51
Mcnemar's Test P-Value : 1.412e-15
Sensitivity : 0.8797
Specificity : 0.6288
Pos Pred Value : 0.7080
Neg Pred Value : 0.8363
Prevalence : 0.5057
Detection Rate : 0.4449
Detection Prevalence : 0.6283
Balanced Accuracy : 0.7543
'Positive' Class : no
confusionMatrix(as.factor(result), valid.df$y)
Confusion Matrix and Statistics
Reference
Prediction no yes
no 475 231
yes 57 289
Accuracy : 0.7262
95% CI : (0.6982, 0.753)
No Information Rate : 0.5057
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.4503
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.8929
Specificity : 0.5558
Pos Pred Value : 0.6728
Neg Pred Value : 0.8353
Prevalence : 0.5057
Detection Rate : 0.4515
Detection Prevalence : 0.6711
Balanced Accuracy : 0.7243
'Positive' Class : no
par(pty = "s")
info <- roc(ds_train$y, logit_model$fitted.values,plot = TRUE,legacy.axes=TRUE,percent = TRUE, xlab="False Positive Percentage",ylab="True Positive Percentage" ,main = "ROC Curve for Logistic Regression",col="#377eb8",lwd=3,print.auc=TRUE)
info
Call:
roc.default(response = ds_train$y, predictor = logit_model$fitted.values, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", main = "ROC Curve for Logistic Regression", col = "#377eb8", lwd = 3, print.auc = TRUE)
Data: logit_model$fitted.values in 1828 controls (ds_train$y no) < 1847 cases (ds_train$y yes).
Area under the curve: 87.24%
t1 <- Sys.time()
logit_model <- glm(y ~ .,data = ds_train, family = "binomial")
t2 <- Sys.time()
time_taken <- t2-t1
time_taken
Time difference of 0.02214193 secs
knn.pred <- knn(train=ds_train[,-5],test = ds_valid[,-5], cl =
ds_train$y, k=1)
accuracy.df <- confusionMatrix(table(knn.pred, valid.df$y))
accuracy.df
Confusion Matrix and Statistics
knn.pred no yes
no 375 151
yes 157 369
Accuracy : 0.7072
95% CI : (0.6787, 0.7346)
No Information Rate : 0.5057
P-Value [Acc > NIR] : <2e-16
Kappa : 0.4144
Mcnemar's Test P-Value : 0.7757
Sensitivity : 0.7049
Specificity : 0.7096
Pos Pred Value : 0.7129
Neg Pred Value : 0.7015
Prevalence : 0.5057
Detection Rate : 0.3565
Detection Prevalence : 0.5000
Balanced Accuracy : 0.7073
'Positive' Class : no
knn.pred <- knn(train=ds_train[,-5],test = ds_valid[,-5], cl =
ds_train$y, k=11)
accuracy.df <- confusionMatrix(table(knn.pred, valid.df$y))
accuracy.df
Confusion Matrix and Statistics
knn.pred no yes
no 432 123
yes 100 397
Accuracy : 0.788
95% CI : (0.7621, 0.8124)
No Information Rate : 0.5057
P-Value [Acc > NIR] : <2e-16
Kappa : 0.5758
Mcnemar's Test P-Value : 0.1407
Sensitivity : 0.8120
Specificity : 0.7635
Pos Pred Value : 0.7784
Neg Pred Value : 0.7988
Prevalence : 0.5057
Detection Rate : 0.4106
Detection Prevalence : 0.5276
Balanced Accuracy : 0.7877
'Positive' Class : no
t1 <- Sys.time()
knn.pred <- knn(train=ds_train[,-5],test = ds_valid[,-5], cl =
ds_train$y, k=11)
t2 <- Sys.time()
time_taken <- t2-t1
time_taken
Time difference of 0.1225719 secs
knn.pred <- knn(train=ds_train[,-5],test = ds_valid[,-5], cl =
ds_train$y, k=11,prob = TRUE)
scores.knn <- attr(knn.pred,"prob")
par(pty = "s")
info <- roc(ds_valid$y, scores.knn,plot = TRUE,legacy.axes=TRUE,percent = TRUE, xlab="False Positive Percentage",ylab="True Positive Percentage" ,main = "ROC Curve for KNN Model",col="orange",lwd=3,print.auc=TRUE)
info
Call:
roc.default(response = ds_valid$y, predictor = scores.knn, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", main = "ROC Curve for KNN Model", col = "orange", lwd = 3, print.auc = TRUE)
Data: scores.knn in 532 controls (ds_valid$y no) < 520 cases (ds_valid$y yes).
Area under the curve: 44.61%
printcp(class.tree)
Classification tree:
rpart(formula = y ~ ., data = t.train.df, method = "class", control = rpart.control(maxdepth = 7),
minbucket = 50)
Variables actually used in tree construction:
[1] contact duration job month poutcome
Root node error: 1828/3675 = 0.49741
n= 3675
CP nsplit rel error xerror xstd
1 0.485777 0 1.00000 1.03118 0.016576
2 0.038840 1 0.51422 0.53939 0.014694
3 0.034737 2 0.47538 0.48195 0.014158
4 0.028446 4 0.40591 0.43545 0.013661
5 0.012582 5 0.37746 0.39880 0.013224
6 0.012035 6 0.36488 0.38348 0.013029
7 0.010000 9 0.32768 0.37309 0.012893
pred.tree <- predict(class.tree, v.valid.df, type = "class")
confusionMatrix(pred.tree,as.factor(v.valid.df$y))
Confusion Matrix and Statistics
Reference
Prediction no yes
no 425 118
yes 107 402
Accuracy : 0.7861
95% CI : (0.7601, 0.8105)
No Information Rate : 0.5057
P-Value [Acc > NIR] : <2e-16
Kappa : 0.5721
Mcnemar's Test P-Value : 0.505
Sensitivity : 0.7989
Specificity : 0.7731
Pos Pred Value : 0.7827
Neg Pred Value : 0.7898
Prevalence : 0.5057
Detection Rate : 0.4040
Detection Prevalence : 0.5162
Balanced Accuracy : 0.7860
'Positive' Class : no
cv.ct <- rpart(y~ ., data = t.train.df[c(2,9,11,12,16,17)], method = "class", cp = 0.00001, minsplit = 5, xval = 5)
printcp(cv.ct)
Classification tree:
rpart(formula = y ~ ., data = t.train.df[c(2, 9, 11, 12, 16,
17)], method = "class", cp = 1e-05, minsplit = 5, xval = 5)
Variables actually used in tree construction:
[1] contact duration job month poutcome
Root node error: 1828/3675 = 0.49741
n= 3675
CP nsplit rel error xerror xstd
1 4.8578e-01 0 1.00000 1.03173 0.016576
2 3.8840e-02 1 0.51422 0.53392 0.014646
3 3.4737e-02 2 0.47538 0.47046 0.014041
4 2.8446e-02 4 0.40591 0.45624 0.013890
5 1.2582e-02 5 0.37746 0.37199 0.012878
6 1.2035e-02 6 0.36488 0.35613 0.012661
7 5.4705e-03 9 0.32768 0.34081 0.012443
8 3.8293e-03 11 0.31674 0.33753 0.012395
9 3.5558e-03 14 0.30525 0.33425 0.012347
10 2.7352e-03 16 0.29814 0.33425 0.012347
11 2.1882e-03 18 0.29267 0.33042 0.012290
12 1.6411e-03 20 0.28829 0.32877 0.012265
13 1.3676e-03 21 0.28665 0.33589 0.012371
14 1.0941e-03 29 0.27516 0.33753 0.012395
15 9.1174e-04 48 0.25438 0.33643 0.012379
16 8.2057e-04 55 0.24781 0.34245 0.012467
17 7.2939e-04 68 0.23687 0.34300 0.012475
18 6.8381e-04 74 0.23249 0.34300 0.012475
19 5.4705e-04 78 0.22976 0.35886 0.012699
20 4.9234e-04 120 0.20678 0.37035 0.012856
21 4.1028e-04 157 0.18326 0.38950 0.013107
22 3.6470e-04 161 0.18162 0.39059 0.013121
23 3.2823e-04 197 0.16794 0.39059 0.013121
24 3.1260e-04 211 0.16247 0.40263 0.013272
25 2.7352e-04 218 0.16028 0.40536 0.013306
26 2.1882e-04 254 0.15044 0.41028 0.013366
27 2.0514e-04 264 0.14825 0.42013 0.013483
28 1.8235e-04 272 0.14661 0.42013 0.013483
29 1.3676e-04 295 0.14114 0.42779 0.013573
30 1.2157e-04 307 0.13950 0.42888 0.013586
31 1.0941e-04 316 0.13840 0.42888 0.013586
32 7.8149e-05 321 0.13786 0.42888 0.013586
33 1.0000e-05 328 0.13731 0.42888 0.013586
pruned.ct <- prune(cv.ct, cp = cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
length(pruned.ct$frame$var[pruned.ct$frame$var == "<leaf>"])
[1] 21
prp(pruned.ct, type = 1, extra = 1, split.font = 1, varlen = -10)
Bad 'data' field in model 'call' (expected a data.frame or a matrix).
To silence this warning:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
pred.tree <- predict(pruned.ct, v.valid.df, type = "class")
confusionMatrix(pred.tree,as.factor(v.valid.df$y))
Confusion Matrix and Statistics
Reference
Prediction no yes
no 425 115
yes 107 405
Accuracy : 0.789
95% CI : (0.763, 0.8133)
No Information Rate : 0.5057
P-Value [Acc > NIR] : <2e-16
Kappa : 0.5778
Mcnemar's Test P-Value : 0.6385
Sensitivity : 0.7989
Specificity : 0.7788
Pos Pred Value : 0.7870
Neg Pred Value : 0.7910
Prevalence : 0.5057
Detection Rate : 0.4040
Detection Prevalence : 0.5133
Balanced Accuracy : 0.7889
'Positive' Class : no
t1 <- Sys.time()
cv.ct <- rpart(y~ ., data = t.train.df[-c(1:3,5,6,8,10)], method = "class", cp = 0.00001, minsplit = 5, xval = 5)
pruned.ct <- prune(cv.ct, cp = cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
t2 <- Sys.time()
time_taken <- t2-t1
time_taken
Time difference of 0.103914 secs
tree.pred <- predict(pruned.ct, v.valid.df, type = "class")
table_data <- table(prediction = tree.pred,actual=v.valid.df$y)
# Accuracy Metric
sum(diag(table_data))/sum(table_data)
[1] 0.8108365
pred.tree <- predict(pruned.ct, v.valid.df, type = "prob")
par(pty = "s")
info <- roc(v.valid.df$y, pred.tree[,2],plot = TRUE,legacy.axes=TRUE,percent = TRUE, xlab="False Positive Percentage",ylab="True Positive Percentage" ,main = "ROC Curve for Classification Tree Model",col="purple",lwd=3,print.auc=TRUE)
info
Call:
roc.default(response = v.valid.df$y, predictor = pred.tree[, 2], percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", main = "ROC Curve for Classification Tree Model", col = "purple", lwd = 3, print.auc = TRUE)
Data: pred.tree[, 2] in 532 controls (v.valid.df$y no) < 520 cases (v.valid.df$y yes).
Area under the curve: 83.46%
pred.tree <- predict(class.tree, s_validdata, type = "class")
confusionMatrix(pred.tree,as.factor(s_validdata$y))
Confusion Matrix and Statistics
Reference
Prediction no yes
no 442 152
yes 90 368
Accuracy : 0.77
95% CI : (0.7433, 0.7951)
No Information Rate : 0.5057
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5392
Mcnemar's Test P-Value : 8.81e-05
Sensitivity : 0.8308
Specificity : 0.7077
Pos Pred Value : 0.7441
Neg Pred Value : 0.8035
Prevalence : 0.5057
Detection Rate : 0.4202
Detection Prevalence : 0.5646
Balanced Accuracy : 0.7693
'Positive' Class : no
cv.ct <- rpart(y~ ., data = s_traindata, method = "class", cp = 0.00001, minsplit = 5, xval = 5)
printcp(cv.ct)
Classification tree:
rpart(formula = y ~ ., data = s_traindata, method = "class",
cp = 1e-05, minsplit = 5, xval = 5)
Variables actually used in tree construction:
[1] duration housing loan poutcome
Root node error: 1828/3675 = 0.49741
n= 3675
CP nsplit rel error xerror xstd
1 0.48577681 0 1.00000 1.00000 0.016581
2 0.03719912 1 0.51422 0.53446 0.014651
3 0.03227571 3 0.43982 0.45624 0.013890
4 0.00984683 4 0.40755 0.42505 0.013541
5 0.00382932 5 0.39770 0.40372 0.013285
6 0.00200584 9 0.38239 0.40263 0.013272
7 0.00164114 14 0.37090 0.40317 0.013279
8 0.00127644 21 0.35886 0.40481 0.013299
9 0.00123085 24 0.35503 0.40810 0.013339
10 0.00109409 28 0.35011 0.40810 0.013339
11 0.00082057 34 0.34354 0.41028 0.013366
12 0.00072939 44 0.33534 0.42177 0.013503
13 0.00063822 49 0.33151 0.43217 0.013623
14 0.00054705 55 0.32768 0.43217 0.013623
15 0.00036470 89 0.30908 0.44201 0.013735
16 0.00031260 138 0.28720 0.46444 0.013978
17 0.00027352 147 0.28392 0.46608 0.013995
18 0.00023445 213 0.26477 0.47867 0.014125
19 0.00021882 220 0.26313 0.48687 0.014207
20 0.00019893 225 0.26204 0.48687 0.014207
21 0.00018235 242 0.25821 0.48796 0.014218
22 0.00013676 257 0.25547 0.49234 0.014261
23 0.00001000 275 0.25274 0.49781 0.014314
pruned.ct <- prune(cv.ct, cp = cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
length(pruned.ct$frame$var[pruned.ct$frame$var == "<leaf>"])
[1] 10
prp(pruned.ct, type = 1, extra = 1, split.font = 1, varlen = -10)
pred.tree <- predict(pruned.ct, s_validdata, type = "class")
confusionMatrix(pred.tree,as.factor(s_validdata$y))
Confusion Matrix and Statistics
Reference
Prediction no yes
no 442 146
yes 90 374
Accuracy : 0.7757
95% CI : (0.7492, 0.8005)
No Information Rate : 0.5057
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5507
Mcnemar's Test P-Value : 0.0003433
Sensitivity : 0.8308
Specificity : 0.7192
Pos Pred Value : 0.7517
Neg Pred Value : 0.8060
Prevalence : 0.5057
Detection Rate : 0.4202
Detection Prevalence : 0.5589
Balanced Accuracy : 0.7750
'Positive' Class : no
t1 <- Sys.time()
cv.ct <- rpart(y~ ., data = s_traindata, method = "class", cp = 0.00001, minsplit = 5, xval = 5)
pruned.ct <- prune(cv.ct, cp = cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
t2 <- Sys.time()
time_taken <- t2-t1
time_taken
Time difference of 0.1059 secs
pred.tree <- predict(pruned.ct, s_validdata, type = "prob")
par(pty = "s")
info <- roc(s_validdata$y, pred.tree[,2],plot = TRUE,legacy.axes=TRUE,percent = TRUE, xlab="False Positive Percentage",ylab="True Positive Percentage" ,main = "ROC Curve for Classification Tree Model",col="purple",lwd=3,print.auc=TRUE)
info
Call:
roc.default(response = s_validdata$y, predictor = pred.tree[, 2], percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", main = "ROC Curve for Classification Tree Model", col = "purple", lwd = 3, print.auc = TRUE)
Data: pred.tree[, 2] in 532 controls (s_validdata$y no) < 520 cases (s_validdata$y yes).
Area under the curve: 84.79%
rf <- randomForest(as.factor(y) ~ ., data = s_traindata, ntree = 100,
mtry = 4, nodesize = 5, importance = TRUE)
rf
Call:
randomForest(formula = as.factor(y) ~ ., data = s_traindata, ntree = 100, mtry = 4, nodesize = 5, importance = TRUE)
Type of random forest: classification
Number of trees: 100
No. of variables tried at each split: 4
OOB estimate of error rate: 24.03%
Confusion matrix:
no yes class.error
no 1385 443 0.2423414
yes 440 1407 0.2382241
rf.pred <- predict(rf, s_validdata)
confusionMatrix(rf.pred, s_validdata$y)
Confusion Matrix and Statistics
Reference
Prediction no yes
no 421 141
yes 111 379
Accuracy : 0.7605
95% CI : (0.7335, 0.786)
No Information Rate : 0.5057
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.5205
Mcnemar's Test P-Value : 0.06773
Sensitivity : 0.7914
Specificity : 0.7288
Pos Pred Value : 0.7491
Neg Pred Value : 0.7735
Prevalence : 0.5057
Detection Rate : 0.4002
Detection Prevalence : 0.5342
Balanced Accuracy : 0.7601
'Positive' Class : no
t1 <- Sys.time()
rf <- randomForest(as.factor(y) ~ ., data = s_traindata, ntree = 100,
mtry = 4, nodesize = 5, importance = TRUE)
t2 <- Sys.time()
time_taken <- t2-t1
time_taken
Time difference of 0.445951 secs
par(pty = "s")
info <- roc(s_traindata$y, rf$votes[,1],plot = TRUE,legacy.axes=TRUE,percent = TRUE, xlab="False Positive Percentage",ylab="True Positive Percentage" ,main = "ROC Curve for Random Forest Model",col="#4daf4a",lwd=3,print.auc=TRUE)
info
Call:
roc.default(response = s_traindata$y, predictor = rf$votes[, 1], percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", main = "ROC Curve for Random Forest Model", col = "#4daf4a", lwd = 3, print.auc = TRUE)
Data: rf$votes[, 1] in 1828 controls (s_traindata$y no) > 1847 cases (s_traindata$y yes).
Area under the curve: 83.45%
s_traindata$y <- as.factor(s_traindata$y)
set.seed(1)
boost <- boosting(y ~ ., data = s_traindata)
pred <- predict(boost, s_validdata)
confusionMatrix(as.factor(pred$class), as.factor(s_validdata$y))
Confusion Matrix and Statistics
Reference
Prediction no yes
no 430 98
yes 102 422
Accuracy : 0.8099
95% CI : (0.7848, 0.8332)
No Information Rate : 0.5057
P-Value [Acc > NIR] : <2e-16
Kappa : 0.6198
Mcnemar's Test P-Value : 0.832
Sensitivity : 0.8083
Specificity : 0.8115
Pos Pred Value : 0.8144
Neg Pred Value : 0.8053
Prevalence : 0.5057
Detection Rate : 0.4087
Detection Prevalence : 0.5019
Balanced Accuracy : 0.8099
'Positive' Class : no
t1 <- Sys.time()
rf <- boost <- boosting(y ~ ., data = s_traindata)
t2 <- Sys.time()
time_taken <- t2-t1
time_taken
Time difference of 29.59273 secs
par(pty = "s")
info <- roc(s_traindata$y, boost$votes[,1],plot = TRUE,legacy.axes=TRUE,percent = TRUE, xlab="False Positive Percentage",ylab="True Positive Percentage" ,main = "ROC Curve for Boosted Tree",col="#4daf4a",lwd=3,print.auc=TRUE)
info
Call:
roc.default(response = s_traindata$y, predictor = boost$votes[, 1], percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", main = "ROC Curve for Boosted Tree", col = "#4daf4a", lwd = 3, print.auc = TRUE)
Data: boost$votes[, 1] in 1828 controls (s_traindata$y no) > 1847 cases (s_traindata$y yes).
Area under the curve: 89.64%
Based on the Accuracy and Confidence Interval, the following models seems convenient and will be used further. 1. KNN (with K=11) 2. Classification Tree(Pruned) 3. Random Forest Classification Model(Boosted)
Now we will implement these models on the test data with the relevant predictors
knn.pred <- knn(train=ds_train[,-5],test = ds_test[,-5], cl =
ds_train$y, k=11)
accuracy.df <- confusionMatrix(table(knn.pred, ds_test$y))
accuracy.df
Confusion Matrix and Statistics
knn.pred no yes
no 224 71
yes 60 206
Accuracy : 0.7665
95% CI : (0.7292, 0.8009)
No Information Rate : 0.5062
P-Value [Acc > NIR] : <2e-16
Kappa : 0.5327
Mcnemar's Test P-Value : 0.3823
Sensitivity : 0.7887
Specificity : 0.7437
Pos Pred Value : 0.7593
Neg Pred Value : 0.7744
Prevalence : 0.5062
Detection Rate : 0.3993
Detection Prevalence : 0.5258
Balanced Accuracy : 0.7662
'Positive' Class : no
par(pty = "s")
info <- roc(ds_test$y, scores.knn,plot = TRUE,legacy.axes=TRUE,percent = TRUE, xlab="False Positive Percentage",ylab="True Positive Percentage" ,main = "ROC Curve for KNN Model",col="orange",lwd=3,print.auc=TRUE)
info
Call:
roc.default(response = ds_test$y, predictor = scores.knn, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", main = "ROC Curve for KNN Model", col = "orange", lwd = 3, print.auc = TRUE)
Data: scores.knn in 284 controls (ds_test$y no) < 277 cases (ds_test$y yes).
Area under the curve: 41.84%
t1 <- Sys.time()
knn.pred <- knn(train=ds_train[,-5],test = ds_test[,-5], cl =
ds_train$y, k=11)
t2 <- Sys.time()
time_taken <- t2-t1
time_taken
Time difference of 0.08148694 secs
cv.ct <- rpart(y~ ., data = s_traindata, method = "class", cp = 0.00001, minsplit = 5, xval = 5)
printcp(cv.ct)
Classification tree:
rpart(formula = y ~ ., data = s_traindata, method = "class",
cp = 1e-05, minsplit = 5, xval = 5)
Variables actually used in tree construction:
[1] duration housing loan poutcome
Root node error: 1828/3675 = 0.49741
n= 3675
CP nsplit rel error xerror xstd
1 0.48577681 0 1.00000 1.03611 0.016574
2 0.03719912 1 0.51422 0.51860 0.014509
3 0.03227571 3 0.43982 0.45952 0.013926
4 0.00984683 4 0.40755 0.41028 0.013366
5 0.00382932 5 0.39770 0.40427 0.013292
6 0.00200584 9 0.38239 0.39934 0.013231
7 0.00164114 14 0.37090 0.40317 0.013279
8 0.00127644 21 0.35886 0.39989 0.013238
9 0.00123085 24 0.35503 0.40098 0.013252
10 0.00109409 28 0.35011 0.40098 0.013252
11 0.00082057 34 0.34354 0.40153 0.013258
12 0.00072939 44 0.33534 0.40536 0.013306
13 0.00063822 49 0.33151 0.40919 0.013352
14 0.00054705 55 0.32768 0.40919 0.013352
15 0.00036470 89 0.30908 0.42560 0.013548
16 0.00031260 138 0.28720 0.45405 0.013867
17 0.00027352 147 0.28392 0.45569 0.013884
18 0.00023445 213 0.26477 0.45842 0.013914
19 0.00021882 220 0.26313 0.46718 0.014006
20 0.00019893 225 0.26204 0.46718 0.014006
21 0.00018235 242 0.25821 0.46772 0.014012
22 0.00013676 257 0.25547 0.47921 0.014130
23 0.00001000 275 0.25274 0.48414 0.014180
pruned.ct <- prune(cv.ct, cp = cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
length(pruned.ct$frame$var[pruned.ct$frame$var == "<leaf>"])
[1] 10
prp(pruned.ct, type = 1, extra = 1, split.font = 1, varlen = -10)
pred.tree <- predict(pruned.ct, s_testdata, type = "class")
confusionMatrix(pred.tree,as.factor(s_testdata$y))
Confusion Matrix and Statistics
Reference
Prediction no yes
no 228 83
yes 56 194
Accuracy : 0.7522
95% CI : (0.7143, 0.7874)
No Information Rate : 0.5062
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.5038
Mcnemar's Test P-Value : 0.02743
Sensitivity : 0.8028
Specificity : 0.7004
Pos Pred Value : 0.7331
Neg Pred Value : 0.7760
Prevalence : 0.5062
Detection Rate : 0.4064
Detection Prevalence : 0.5544
Balanced Accuracy : 0.7516
'Positive' Class : no
t1 <- Sys.time()
cv.ct <- rpart(y~ ., data = s_traindata, method = "class", cp = 0.00001, minsplit = 5, xval = 5)
pruned.ct <- prune(cv.ct, cp = cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
t2 <- Sys.time()
time_taken <- t2-t1
time_taken
Time difference of 0.1881721 secs
pred.tree <- predict(pruned.ct, s_testdata, type = "prob")
par(pty = "s")
info <- roc(s_testdata$y, pred.tree[,2],plot = TRUE,legacy.axes=TRUE,percent = TRUE, xlab="False Positive Percentage",ylab="True Positive Percentage" ,main = "ROC Curve for Pruned Tree Model",col="purple",lwd=3,print.auc=TRUE)
info
Call:
roc.default(response = s_testdata$y, predictor = pred.tree[, 2], percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", main = "ROC Curve for Pruned Tree Model", col = "purple", lwd = 3, print.auc = TRUE)
Data: pred.tree[, 2] in 284 controls (s_testdata$y no) < 277 cases (s_testdata$y yes).
Area under the curve: 84.05%
rf <- randomForest(as.factor(y) ~ ., data = s_traindata, ntree = 100,
mtry = 4, nodesize = 5, importance = TRUE)
rf.pred <- predict(rf, s_testdata)
confusionMatrix(rf.pred, s_testdata$y)
Confusion Matrix and Statistics
Reference
Prediction no yes
no 220 85
yes 64 192
Accuracy : 0.7344
95% CI : (0.6958, 0.7705)
No Information Rate : 0.5062
P-Value [Acc > NIR] : <2e-16
Kappa : 0.4682
Mcnemar's Test P-Value : 0.1013
Sensitivity : 0.7746
Specificity : 0.6931
Pos Pred Value : 0.7213
Neg Pred Value : 0.7500
Prevalence : 0.5062
Detection Rate : 0.3922
Detection Prevalence : 0.5437
Balanced Accuracy : 0.7339
'Positive' Class : no
pred <- predict(boost, s_testdata)
confusionMatrix(as.factor(pred$class), as.factor(s_testdata$y))
Confusion Matrix and Statistics
Reference
Prediction no yes
no 224 54
yes 60 223
Accuracy : 0.7968
95% CI : (0.7611, 0.8293)
No Information Rate : 0.5062
P-Value [Acc > NIR] : <2e-16
Kappa : 0.5936
Mcnemar's Test P-Value : 0.6396
Sensitivity : 0.7887
Specificity : 0.8051
Pos Pred Value : 0.8058
Neg Pred Value : 0.7880
Prevalence : 0.5062
Detection Rate : 0.3993
Detection Prevalence : 0.4955
Balanced Accuracy : 0.7969
'Positive' Class : no
par(pty = "s")
info <- roc(s_traindata$y, boost$votes[,1],plot = TRUE,legacy.axes=TRUE,percent = TRUE, xlab="False Positive Percentage",ylab="True Positive Percentage" ,main = "ROC Curve for Boosted Tree",col="#4daf4a",lwd=3,print.auc=TRUE)
info
Call:
roc.default(response = s_traindata$y, predictor = boost$votes[, 1], percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", main = "ROC Curve for Boosted Tree", col = "#4daf4a", lwd = 3, print.auc = TRUE)
Data: boost$votes[, 1] in 1828 controls (s_traindata$y no) > 1847 cases (s_traindata$y yes).
Area under the curve: 89.64%
t1 <- Sys.time()
rf <- boost <- boosting(y ~ ., data = s_traindata)
t2 <- Sys.time()
time_taken <- t2-t1
time_taken
Time difference of 29.77382 secs